import warnings
warnings.filterwarnings("ignore")
import scanpy as sc #software suite of tools for single-cell analysis in python
import besca as bc #internal BEDA package for single cell analysis
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy
import anndata as ad
from scipy.sparse import csr_matrix
import scanpy.external as sce
from harmony import harmonize
import umap.umap_ as umap
print(ad.__version__)
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
INFO:torch.distributed.nn.jit.instantiator:Created a temporary directory at /tmp/tmp3dalgwgk INFO:torch.distributed.nn.jit.instantiator:Writing /tmp/tmp3dalgwgk/_remote_module_non_scriptable.py INFO:lightning_fabric.utilities.seed:Global seed set to 0
0.9.1
Displaying result settings required for single cell analysis
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')
scanpy==1.9.3 anndata==0.9.1 umap==0.3.10 numpy==1.23.5 scipy==1.10.1 pandas==2.0.1 scikit-learn==1.2.2 statsmodels==0.14.0 python-igraph==0.10.4 louvain==0.8.0 pynndescent==0.5.10
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/_settings.py:447: DeprecationWarning: `set_matplotlib_formats` is deprecated since IPython 7.23, directly use `matplotlib_inline.backend_inline.set_matplotlib_formats()`
Loading all PBMC datasets processed by Pipeline-Version: cellranger-7.1.0 into the workspace
PBMCsarc1: SAM24412250-Sarcoidosis_Donor1_PBMC-male-57yrs-white from Genentech (10x Genomics Chromium v3.1 3’ NovaSeq 6000)
PBMCsarc2: SAM24412252 Sarcoidosis_Donor2_PBMC: male-35yrs-southasisan sequenced by Genentech (10x Genomics Chromium v3.1 3’ NovaSeq 6000)
PBMCsarc3: SAM24412252 Sarcoidosis_Donor3_PBMC: female-60yrs-white sequenced by Genentech (10x Genomics Chromium v3.1 3’ NovaSeq 6000)
PBMChealthy1: SC3_v3_NextGem_DI_CellPlex_Human_PBMC_10K_h1: healthy female-19yrs from 10x Genomics database (10x Genomics Chromium v3.1 3’ NovaSeq 6000). For more info link
PBMChealthy2: 5k_pbmc_v3_nextgem_fastqs_h2 from 10x Genomics database a healthy donor (gender not specified) (10x Genomics Chromium v3.1 3’ NovaSeq 6000). For more info Link
PBMChealthy3: 3p_Citrate_CPT_fastqs_h3: Healthy female from 10x Genomics database (10x Genomics Chromium v3.1 3’ NovaSeq 6000). For more info Link
PBMChealthy4: 10k_PBMC_3p_nextgem_Chromium_X_fastqs_h4: Healthy female-25-30yrs (10x Genomics Chromium v3.1 3’ NovaSeq 6000). For more info Link
#Load Disease PBMC dataset1 for sarcoidosis
PBMCsarc1=sc.read_10x_mtx('/raid02/Data-live/tjana/LIB5455299_SAM24412250/outs/filtered_feature_bc_matrix/',
var_names='gene_symbols',cache=True)
#Load Disease PBMC dataset2 for sarcoidosis
PBMCsarc2=sc.read_10x_mtx('/raid02/Data-live/tjana/LIB5455301_SAM24412252/outs/filtered_feature_bc_matrix/',
var_names='gene_symbols',cache=True)
#Load Disease PBMC dataset1 for sarcoidosis
PBMCsarc3=sc.read_10x_mtx('/raid02/Data-live/tjana/LIB5455303_SAM24412254/outs/filtered_feature_bc_matrix/',
var_names='gene_symbols',cache=True)
#load Healthy PBMC Control1 from 10X library
PBMChealthy1=sc.read_10x_mtx('/raid02/Data-live/tjana/multi/SC3_v3_NextGem_DI_CellPlex_Human_PBMC_10K_h1/outs/per_sample_outs/PBMCs_human_2/count/sample_filtered_feature_bc_matrix/',
var_names='gene_symbols',cache=True)
#load Healthy PBMC Control2 from 10X library
PBMChealthy2=sc.read_10x_mtx('/raid02/Data-live/tjana/5k_pbmc_v3_nextgem_fastqs_h2/outs/filtered_feature_bc_matrix/',
var_names='gene_symbols',cache=True)
#load Healthy PBMC Control3 from 10X library
PBMChealthy3=sc.read_10x_mtx('/raid02/Data-live/tjana/3p_Citrate_CPT_fastqs_h3/outs/filtered_feature_bc_matrix/',
var_names='gene_symbols',cache=True)
#load Healthy PBMC Control4 from 10X library
PBMChealthy4=sc.read_10x_mtx('/raid02/Data-live/tjana/10k_PBMC_3p_nextgem_Chromium_X_fastqs_h4/outs/filtered_feature_bc_matrix/',
var_names='gene_symbols',cache=True)
... reading from cache file cache/raid02-Data-live-tjana-LIB5455299_SAM24412250-outs-filtered_feature_bc_matrix-matrix.h5ad ... reading from cache file cache/raid02-Data-live-tjana-LIB5455301_SAM24412252-outs-filtered_feature_bc_matrix-matrix.h5ad ... reading from cache file cache/raid02-Data-live-tjana-LIB5455303_SAM24412254-outs-filtered_feature_bc_matrix-matrix.h5ad ... reading from cache file cache/raid02-Data-live-tjana-multi-SC3_v3_NextGem_DI_CellPlex_Human_PBMC_10K_h1-outs-per_sample_outs-PBMCs_human_2-count-sample_filtered_feature_bc_matrix-matrix.h5ad ... reading from cache file cache/raid02-Data-live-tjana-5k_pbmc_v3_nextgem_fastqs_h2-outs-filtered_feature_bc_matrix-matrix.h5ad ... reading from cache file cache/raid02-Data-live-tjana-3p_Citrate_CPT_fastqs_h3-outs-filtered_feature_bc_matrix-matrix.h5ad ... reading from cache file cache/raid02-Data-live-tjana-10k_PBMC_3p_nextgem_Chromium_X_fastqs_h4-outs-filtered_feature_bc_matrix-matrix.h5ad
Making all indexes into unique of all samples (Disease SARCOIDOSIS and Healthy)
#Making all indexes into unique of all samples (Disease SARCOIDOSIS and Healthy)
#Sarcoidosis disease
PBMCsarc1.var_names_make_unique()
PBMCsarc2.var_names_make_unique()
PBMCsarc3.var_names_make_unique()
#Healthy/control
PBMChealthy1.var_names_make_unique()
PBMChealthy2.var_names_make_unique()
PBMChealthy3.var_names_make_unique()
PBMChealthy4.var_names_make_unique()
Adding some metadata for all PBMC samples
# Adding some metadata for all PBMC samples
PBMCsarc1.obs['type']="Sarcoidosis"
PBMCsarc1.obs['sample']="PBMC-Sarc-1"
PBMCsarc2.obs['type']="Sarcoidosis"
PBMCsarc2.obs['sample']="PBMC-Sarc-2"
PBMCsarc3.obs['type']="Sarcoidosis"
PBMCsarc3.obs['sample']="PBMC-Sarc-3"
PBMChealthy1.obs['type']="Healthy"
PBMChealthy1.obs['sample']="PBMC-healthy-1"
PBMChealthy2.obs['type']="Healthy"
PBMChealthy2.obs['sample']="PBMC-healthy-2"
PBMChealthy3.obs['type']="Healthy"
PBMChealthy3.obs['sample']="PBMC-healthy-3"
PBMChealthy4.obs['type']="Healthy"
PBMChealthy4.obs['sample']="PBMC-healthy-4"
#All samples are merged into one object named adata (anndata - Annotated data)
adata = PBMCsarc1.concatenate(PBMCsarc2, PBMCsarc3, PBMChealthy1, PBMChealthy2, PBMChealthy3, PBMChealthy4)
#adata = PBMCsarc1.concatenate(PBMCsarc2, PBMCsarc3, PBMChealthy1, PBMChealthy2, PBMChealthy3)
# Deleting individual datasets to save space
del(PBMCsarc1, PBMCsarc2, PBMCsarc3)
del(PBMChealthy1, PBMChealthy2, PBMChealthy3, PBMChealthy4)
#del(PBMChealthy1, PBMChealthy2, PBMChealthy3)
/home/jana/my-notebook-venv/lib/python3.8/site-packages/anndata/_core/anndata.py:1755: FutureWarning: The AnnData.concatenate method is deprecated in favour of the anndata.concat function. Please use anndata.concat instead. See the tutorial for concat at: https://anndata.readthedocs.io/en/latest/concatenation.html
Displaying merged object observation and variables types
# Displaying merged object observation and variables types
print (adata)
print ("------###---")
#Displaying counts cells sample wise
display(adata.obs['sample'].value_counts())
#Displaying counts total cells counts of healthy and Sarcoid (Sarc)
print ("------###---")
display(adata.obs['type'].value_counts())
AnnData object with n_obs × n_vars = 53455 × 36601
obs: 'type', 'sample', 'batch'
var: 'gene_ids', 'feature_types'
------###---
sample PBMC-healthy-4 11999 PBMC-Sarc-2 10029 PBMC-Sarc-3 8754 PBMC-Sarc-1 7438 PBMC-healthy-1 6093 PBMC-healthy-2 5184 PBMC-healthy-3 3958 Name: count, dtype: int64
------###---
type Healthy 27234 Sarcoidosis 26221 Name: count, dtype: int64
# Print merged adata var (variable) types
display (adata.var)
| gene_ids | feature_types | |
|---|---|---|
| MIR1302-2HG | ENSG00000243485 | Gene Expression |
| FAM138A | ENSG00000237613 | Gene Expression |
| OR4F5 | ENSG00000186092 | Gene Expression |
| AL627309.1 | ENSG00000238009 | Gene Expression |
| AL627309.3 | ENSG00000239945 | Gene Expression |
| ... | ... | ... |
| AC141272.1 | ENSG00000277836 | Gene Expression |
| AC023491.2 | ENSG00000278633 | Gene Expression |
| AC007325.1 | ENSG00000276017 | Gene Expression |
| AC007325.4 | ENSG00000278817 | Gene Expression |
| AC007325.2 | ENSG00000277196 | Gene Expression |
36601 rows × 2 columns
# Print merged adata obs observation types
display (adata.obs)
| type | sample | batch | |
|---|---|---|---|
| AAACCCAAGACATAAC-1-0 | Sarcoidosis | PBMC-Sarc-1 | 0 |
| AAACCCAAGAGGCGGA-1-0 | Sarcoidosis | PBMC-Sarc-1 | 0 |
| AAACCCAAGCGTACAG-1-0 | Sarcoidosis | PBMC-Sarc-1 | 0 |
| AAACCCAAGGTACAAT-1-0 | Sarcoidosis | PBMC-Sarc-1 | 0 |
| AAACCCACAGCGTACC-1-0 | Sarcoidosis | PBMC-Sarc-1 | 0 |
| ... | ... | ... | ... |
| TTTGTTGGTTGGATCT-1-6 | Healthy | PBMC-healthy-4 | 6 |
| TTTGTTGGTTTCTTAC-1-6 | Healthy | PBMC-healthy-4 | 6 |
| TTTGTTGTCCATTTCA-1-6 | Healthy | PBMC-healthy-4 | 6 |
| TTTGTTGTCTACACAG-1-6 | Healthy | PBMC-healthy-4 | 6 |
| TTTGTTGTCTCATTAC-1-6 | Healthy | PBMC-healthy-4 | 6 |
53455 rows × 3 columns
# Identification of mitochondrial genes by giving a pattern 'MT-'
adata.var['mt'] = adata.var_names.str.startswith('MT-')
# Identification ribosomal genes by giving a pattern 'RPS/RPL'
adata.var['ribo'] = adata.var_names.str.startswith(("RPS","RPL"))
# Identification hemoglobin genes by giving a pattern ^HB[^(P)]
adata.var['hb'] = adata.var_names.str.contains(("^HB[^(P)]"))
# Print merged adata varibale
display ("adata variables types including genes_id, feature types, mt, ribo and hb")
'adata variables types including genes_id, feature types, mt, ribo and hb'
#Calculating the QC metrices of merged object adata
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt','ribo','hb'], percent_top=None, log1p=False, inplace=True)
#Now you can see that we have additional data in the scanpy obs slot.
print("Computing cell compute fraction of counts in mito genes vs. all genes")
mito_genes = adata.var_names.str.startswith('MT-')
# for each cell compute fraction of counts in mito genes vs. all genes
# the `.A1` is only necessary as X is sparse (to transform to a dense array after summing)
adata.obs['percent_mt2'] = np.sum(
adata[:, mito_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1
# Adding the total counts per cell as observations-annotation to adata
adata.obs['n_counts'] = adata.X.sum(axis=1).A1
print("Total mito genes", sum(mito_genes))
Computing cell compute fraction of counts in mito genes vs. all genes Total mito genes 13
#Now you can see that we have additional data in the scanpy obs slot.
print("Computing cell compute fraction of counts in ribo_genes")
ribo_genes = adata.var_names.str.startswith(("RPS","RPL"))
adata.obs['percent_ribo'] = np.sum(
adata[:, ribo_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1
print("Total ribo_genes", sum(ribo_genes))
Computing cell compute fraction of counts in ribo_genes Total ribo_genes 103
adata
AnnData object with n_obs × n_vars = 53455 × 36601
obs: 'type', 'sample', 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'percent_mt2', 'n_counts', 'percent_ribo'
var: 'gene_ids', 'feature_types', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
Plot of Pre QC metrices
#Violin plot of whole QC metrices of merged object in samples wise
print ("Plot of Pre QC metrices")
sc.pl.violin(adata, ['n_counts', 'n_genes_by_counts', 'pct_counts_mt','pct_counts_ribo', 'pct_counts_hb'],
jitter=0.4, groupby = 'sample', rotation = 90)
... storing 'type' as categorical ... storing 'sample' as categorical ... storing 'feature_types' as categorical
Plot of Pre QC metrices
Scatter plot: (total counts vs. pct_counts_mt) & (total counts vs n_genes_by_counts): sample wise
#Scatter plot: (total counts vs. pct_counts_mt) & (total counts vs n_genes_by_counts): sample wise
sc.pl.scatter(adata, x='n_counts', y='pct_counts_mt', color="sample")
sc.pl.scatter(adata, x='n_counts', y='n_genes_by_counts', color="sample")
sc.pl.scatter(adata, x='n_counts', y='percent_ribo', color="sample")
sc.pl.scatter(adata, x='pct_counts_mt', y='percent_ribo', color="sample")
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_anndata.py:490: MatplotlibDeprecationWarning: The legendHandles attribute was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use legend_handles instead.
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_anndata.py:490: MatplotlibDeprecationWarning: The legendHandles attribute was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use legend_handles instead.
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_anndata.py:490: MatplotlibDeprecationWarning: The legendHandles attribute was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use legend_handles instead.
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_anndata.py:490: MatplotlibDeprecationWarning: The legendHandles attribute was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use legend_handles instead.
Basic Filtering with minimum number of cells and genes
# Basic Filtering with minimum number of cells and genes
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
# Displaying Number of observations and Number of variables/features
print ("adata contains observations and variables")
display(adata.n_obs, adata.n_vars)
filtered out 891 cells that have less than 200 genes expressed filtered out 6735 genes that are detected in less than 3 cells adata contains observations and variables
52564
29866
def gene_detection_filter(adata, sample, max_genes=6000):
return (adata.obs['n_genes_by_counts'] < max_genes) & (adata.obs['sample'] == sample)
# Filter for gene detection for PBMC-Sarc-1
keep_Sarc1 = gene_detection_filter(adata, 'PBMC-Sarc-1')
# Filter for gene detection for PBMC-Sarc-2
keep_Sarc2 = gene_detection_filter(adata, 'PBMC-Sarc-2')
# Filter for gene detection for PBMC-Sarc-3
keep_Sarc3 = gene_detection_filter(adata, 'PBMC-Sarc-3')
# Filter for gene detection for PBMC-healthy-1
keep_healthy1 = gene_detection_filter(adata, 'PBMC-healthy-1')
# Filter for gene detection for PBMC-healthy-2
keep_healthy2 = gene_detection_filter(adata, 'PBMC-healthy-2')
# Filter for gene detection for PBMC-healthy-3
keep_healthy3 = gene_detection_filter(adata, 'PBMC-healthy-3')
# Filter for gene detection for PBMC-healthy-4
keep_healthy4 = gene_detection_filter(adata, 'PBMC-healthy-4')
# Keep both sets of cells
keep = (keep_Sarc1) | (keep_Sarc2) | (keep_Sarc3) | (keep_healthy1) | (keep_healthy2) | (keep_healthy3) | (keep_healthy4)
print(sum(keep))
# Update adata with the filtered cells
adata = adata[keep, :]
print("Remaining cells %d" % adata.n_obs)
51979 Remaining cells 51979
Fixing QC cutoff for n_genes_by_counts, pct_counts_mt, pct_counts_ribo and total_counts
#Displaying counts cells sample wise
display(adata.obs['sample'].value_counts())
sample PBMC-healthy-4 11648 PBMC-Sarc-2 9932 PBMC-Sarc-3 8562 PBMC-Sarc-1 7051 PBMC-healthy-1 5972 PBMC-healthy-2 5002 PBMC-healthy-3 3812 Name: count, dtype: int64
# filter for percent mito
adata = adata[adata.obs['pct_counts_mt'] < 15, :]
#Displaying counts cells sample wise
display(adata.obs['sample'].value_counts())
sample PBMC-healthy-4 11514 PBMC-Sarc-2 9782 PBMC-Sarc-3 8371 PBMC-Sarc-1 6889 PBMC-healthy-1 5839 PBMC-healthy-2 4751 PBMC-healthy-3 3732 Name: count, dtype: int64
# filter for pct_counts_ribo for PBMC-Sarc-1
def ribo_filter(adata, sample, max_pct_counts_ribo):
return (adata.obs['pct_counts_ribo'] < max_pct_counts_ribo) & (adata.obs['sample'] == sample)
# Filter for pct_counts_ribo for PBMC-Sarc-1
keep_rb_Sarc_1 = ribo_filter(adata, 'PBMC-Sarc-1', 60)
# Filter for pct_counts_ribo for PBMC-Sarc-2
keep_rb_Sarc_2 = ribo_filter(adata, 'PBMC-Sarc-2', 60)
# Filter for pct_counts_ribo for PBMC-Sarc-3
keep_rb_Sarc_3 = ribo_filter(adata, 'PBMC-Sarc-3', 60)
# Filter for pct_counts_ribo for PBMC-healthy-1
keep_rb_PBMC_healthy1 = ribo_filter(adata, 'PBMC-healthy-1', 40)
# Filter for pct_counts_ribo for PBMC-healthy-2
keep_rb_PBMC_healthy2 = ribo_filter(adata, 'PBMC-healthy-2', 40)
# Filter for pct_counts_ribo for PBMC-healthy-3
keep_rb_PBMC_healthy3 = ribo_filter(adata, 'PBMC-healthy-3', 40)
# Filter for pct_counts_ribo for PBMC-healthy-4
keep_rb_PBMC_healthy4 = ribo_filter(adata, 'PBMC-healthy-4', 40)
# Keep both sets of cells
keep_rb = (keep_rb_Sarc_1) | (keep_rb_Sarc_2) | (keep_rb_Sarc_3) | (keep_rb_PBMC_healthy1) | (keep_rb_PBMC_healthy2) | (keep_rb_PBMC_healthy3) | (keep_rb_PBMC_healthy4)
print(sum(keep_rb))
# Update adata with the filtered cells
adata = adata[keep_rb, :]
print("Remaining cells %d" % adata.n_obs)
50481 Remaining cells 50481
#Displaying counts cells sample wise
display(adata.obs['sample'].value_counts())
sample PBMC-healthy-4 11300 PBMC-Sarc-2 9782 PBMC-Sarc-3 8355 PBMC-Sarc-1 6888 PBMC-healthy-1 5684 PBMC-healthy-2 4750 PBMC-healthy-3 3722 Name: count, dtype: int64
def ncounts_filter(adata, sample, max_n_counts):
return (adata.obs['n_counts'] < max_n_counts) & (adata.obs['sample'] == sample)
# Filter for n_counts for PBMC-Sarc-1
keep_ncount_Sarc1 = ncounts_filter(adata, 'PBMC-Sarc-1', 20000)
# Filter for n_counts for PBMC-Sarc-2
keep_ncount_Sarc2 = ncounts_filter(adata, 'PBMC-Sarc-2', 30000)
# Filter for n_counts for PBMC-Sarc-3
keep_ncount_Sarc3 = ncounts_filter(adata, 'PBMC-Sarc-3', 20000)
# Filter for n_counts for PBMC-healthy-1
keep_ncount_healthy1 = ncounts_filter(adata, 'PBMC-healthy-1', 40000)
# Filter for n_counts for PBMC-healthy-2
keep_ncount_healthy2 = ncounts_filter(adata, 'PBMC-healthy-2', 60000)
# Filter for n_counts for PBMC-healthy-3
keep_ncount_healthy3 = ncounts_filter(adata, 'PBMC-healthy-3', 40000)
# Filter for n_counts for PBMC-healthy-4
keep_ncount_healthy4 = ncounts_filter(adata, 'PBMC-healthy-4', 50000)
# Keep both sets of cells
keep_ncount = (keep_ncount_Sarc1) | (keep_ncount_Sarc2) | (keep_ncount_Sarc3) | (keep_ncount_healthy1) | (keep_ncount_healthy2) | (keep_ncount_healthy3) | (keep_ncount_healthy4)
print(sum(keep_ncount))
# Update adata with the filtered cells
adata = adata[keep_ncount, :]
print("Remaining cells %d" % adata.n_obs)
50431 Remaining cells 50431
#Displaying counts cells sample wise
display(adata.obs['sample'].value_counts())
sample PBMC-healthy-4 11300 PBMC-Sarc-2 9778 PBMC-Sarc-3 8350 PBMC-Sarc-1 6848 PBMC-healthy-1 5683 PBMC-healthy-2 4750 PBMC-healthy-3 3722 Name: count, dtype: int64
Final QC cutoffs plot
#After fitering mito and ribo part, the Violin plot of whole QC metrices of merged object in samples wise
print ("final QC cutoffs for post processing")
sc.pl.violin(adata, ['n_counts', 'n_genes_by_counts', 'pct_counts_mt','pct_counts_ribo', 'pct_counts_hb'],
jitter=0.4, groupby = 'sample', rotation = 90)
final QC cutoffs for post processing
post QC cutoff chosen Top 20 Highest expressed genes
#Post QC cutoff chosen top 20 Highest expressing genes
print ("Highest expressed genes post QC cutoff chosen")
sc.pl.highest_expr_genes(adata, n_top=20)
Highest expressed genes post QC cutoff chosen
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/preprocessing/_normalization.py:170: UserWarning: Received a view of an AnnData. Making a copy.
normalizing counts per cell
finished (0:00:01)
Removing the unwanted genes including mito_genes, hb_genes
# we need to redefine the mito_genes, hb_genes, since they were first
# calculated on the full object before removing low expressed genes.
print ("Removing some unwanted genes named MALAT1")
malat1 = adata.var_names.str.startswith('MALAT1')
mito_genes = adata.var_names.str.startswith('MT-')
hb_genes = adata.var_names.str.contains('^HB[^(P)]')
rb_gene1 = adata.var_names.str.startswith('RPS')
rb_gene2 = adata.var_names.str.startswith('RPL')
remove = np.add(mito_genes, malat1)
remove = np.add(remove, hb_genes)
remove = np.add(remove, rb_gene1)
remove = np.add(remove, rb_gene2)
keep_updated = np.invert(remove)
adata = adata[:,keep_updated]
print(adata.n_obs, adata.n_vars)
Removing some unwanted genes named MALAT1 50431 29738
The Top 20 Highest expressing genes after removing some unwanted genes
#After removing some unwanted genes top 20 Highest expressing genes
print ("After removing some unwanted genes, current top 20 highest expressed genes")
sc.pl.highest_expr_genes(adata, n_top=20)
After removing some unwanted genes, current top 20 highest expressed genes
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/preprocessing/_normalization.py:170: UserWarning: Received a view of an AnnData. Making a copy.
normalizing counts per cell
finished (0:00:01)
import numpy as np
import scanpy as sc
def get_biomart_annotations(species, gene_info):
return sc.queries.biomart_annotations(species, gene_info).set_index("external_gene_name")
def chromosomeY_adjustment_step1(adata, species="hsapiens", gene_info=["ensembl_gene_id", "external_gene_name", "start_position", "end_position", "chromosome_name"]):
annot = get_biomart_annotations(species, gene_info)
chrY_genes = adata.var_names.intersection(annot.index[annot.chromosome_name == "Y"])
return chrY_genes
def calculate_percent_chrY(adata, chrY_genes):
adata.obs['percent_chrY'] = np.sum(adata[:, chrY_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1 * 100
def add_XIST_expression_to_obs(adata):
adata.obs["XIST-counts"] = adata.X[:, adata.var_names.str.match('XIST')].toarray()
def plot_scatter_XIST_percent_chrY(adata):
sc.pl.scatter(adata, x='XIST-counts', y='percent_chrY', color="sample")
def plot_violin_XIST_percent_chrY(adata):
sc.pl.violin(adata, ["XIST-counts", "percent_chrY"], jitter=0.4, groupby='sample', rotation=90)
# Example usage
chrY_genes = chromosomeY_adjustment_step1(adata)
calculate_percent_chrY(adata, chrY_genes)
add_XIST_expression_to_obs(adata)
plot_scatter_XIST_percent_chrY(adata)
plot_violin_XIST_percent_chrY(adata)
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_anndata.py:490: MatplotlibDeprecationWarning: The legendHandles attribute was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use legend_handles instead.
# adata raw is assigned with adata for post processing
adata.raw = adata
#Doublet detetection
print ("Doublet detection")
import scrublet as scr
# split per batch into new objects.
batches = adata.obs['sample'].cat.categories.tolist()
alldata = {}
for batch in batches:
tmp = adata[adata.obs['sample'] == batch,]
print(batch, ":", tmp.shape[0], " cells")
scrub = scr.Scrublet(tmp.raw.X, expected_doublet_rate = 0.06)
out = scrub.scrub_doublets(verbose=False, min_counts=2, min_cells=3,min_gene_variability_pctl=85,n_prin_comps=20)
alldata[batch] = pd.DataFrame({'doublet_score':out[0],'predicted_doublets':out[1]},index = tmp.obs.index)
print(alldata[batch].predicted_doublets.sum(), " predicted_doublets")
print(round(alldata[batch].predicted_doublets.sum()/tmp.shape[0]*100,2), " predicted doublet Percentage\n")
Doublet detection PBMC-Sarc-1 : 6848 cells 321 predicted_doublets 4.69 predicted doublet Percentage PBMC-Sarc-2 : 9778 cells 665 predicted_doublets 6.8 predicted doublet Percentage PBMC-Sarc-3 : 8350 cells 344 predicted_doublets 4.12 predicted doublet Percentage PBMC-healthy-1 : 5683 cells 126 predicted_doublets 2.22 predicted doublet Percentage PBMC-healthy-2 : 4750 cells 74 predicted_doublets 1.56 predicted doublet Percentage PBMC-healthy-3 : 3722 cells 88 predicted_doublets 2.36 predicted doublet Percentage PBMC-healthy-4 : 11300 cells 422 predicted_doublets 3.73 predicted doublet Percentage
#Histogram plot doublet detection
scrub.plot_histogram();
# Doublet detector predictions adding to the adata object.
scrub_pred = pd.concat(alldata.values())
adata.obs['doublet_scores'] = scrub_pred['doublet_score']
adata.obs['predicted_doublets'] = scrub_pred['predicted_doublets']
sum(adata.obs['predicted_doublets'])
2040
# add in column with singlet/doublet instead of True/Fals
%matplotlib inline
adata.obs['doublet_info'] = adata.obs["predicted_doublets"].astype(str)
sc.pl.violin(adata, 'n_genes_by_counts',
jitter=0.4, groupby = 'doublet_info', rotation=45)
... storing 'doublet_info' as categorical
Standard Pipeline appiled
#sandard Pipeline used here
print ("Standard pipeline SC RNA seq begins")
print ("Step1 of pipeline: Normalize the data")
sc.pp.normalize_total(adata, target_sum=1e4)
Standard pipeline SC RNA seq begins
Step1 of pipeline: Normalize the data
normalizing counts per cell
finished (0:00:00)
print ("Step2 of pipeline: Logarithmize the data")
#Logarithmize the data
sc.pp.log1p(adata)
Step2 of pipeline: Logarithmize the data
print ("Step3 of pipeline: Identify highly-variable genes")
#Identify highly-variable genes
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
#displaying Highly variable genes before normalize and after normalization
sc.pl.highly_variable_genes(adata,)
Step3 of pipeline: Identify highly-variable genes
extracting highly variable genes
finished (0:00:04)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
print ("Step4 of of pipeline: Filtering highly variable genes")
#Getting back to an AnnData of the object in .raw by calling .raw.to_adata().
adata.raw = adata
#filtering highly variable genes
adata = adata[:, adata.var.highly_variable]
Step4 of of pipeline: Filtering highly variable genes
print ("Step5 of of pipeline: Regressing out pct_counts_mt, total_counts")
#Regressing out effects of total counts per cell and the percentage of mitochondrial genes expressed
sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])
Step5 of of pipeline: Regressing out pct_counts_mt, total_counts
regressing out ['total_counts', 'pct_counts_mt']
sparse input is densified and may lead to high memory use
finished (0:07:01)
Step6 of pipeline: Scaling
#print ("Step6 of pipeline: Scaling the adata")
#Scale the data to unit variance and Clip values exceeding standard deviation 10
sc.pp.scale(adata, max_value=10)
principal component analysis (PCA) analysis
#print ("Step7 of pipeline: principal component analysis (PCA)")
print ("computing PCA with taking temp adata that does not effect main adata object")
#Reduce the dimensionality of the data by running principal component analysis (PCA). Denoising the data.
# Computing the neighborhood graph
# Neighborhood graph of cells using the PCA representation of the data matrix computation.
# Copy adata not to modify UMAP in the original adata object
adata_temp=adata.copy()
for n_pcs in range(1, 52): # Adjust the range accordingly
sc.tl.pca(adata_temp, n_comps=n_pcs, svd_solver='arpack') # SVD:singular value decomposition of a sparse matrix using SciPy ARPACK wrapper
# Plot the explained variance ratio for each PC
plt.plot(range(1, n_pcs+1), adata_temp.uns['pca']['variance_ratio'], marker='o')
plt.xlabel('Number of PCs')
plt.ylabel('Explained Variance Ratio')
plt.title('Explained Variance Ratio for Each PC')
plt.show()
# Assuming 'adata' is your AnnData object
#delete temporary adata_temp
del adata_temp
computing PCA with taking temp adata that does not effect main adata object
computing PCA
on highly variable genes
with n_comps=1
finished (0:00:04)
computing PCA
on highly variable genes
with n_comps=2
finished (0:00:04)
computing PCA
on highly variable genes
with n_comps=3
finished (0:00:04)
computing PCA
on highly variable genes
with n_comps=4
finished (0:00:04)
computing PCA
on highly variable genes
with n_comps=5
finished (0:00:05)
computing PCA
on highly variable genes
with n_comps=6
finished (0:00:06)
computing PCA
on highly variable genes
with n_comps=7
finished (0:00:05)
computing PCA
on highly variable genes
with n_comps=8
finished (0:00:05)
computing PCA
on highly variable genes
with n_comps=9
finished (0:00:06)
computing PCA
on highly variable genes
with n_comps=10
finished (0:00:06)
computing PCA
on highly variable genes
with n_comps=11
finished (0:00:06)
computing PCA
on highly variable genes
with n_comps=12
finished (0:00:06)
computing PCA
on highly variable genes
with n_comps=13
finished (0:00:07)
computing PCA
on highly variable genes
with n_comps=14
finished (0:00:07)
computing PCA
on highly variable genes
with n_comps=15
finished (0:00:07)
computing PCA
on highly variable genes
with n_comps=16
finished (0:00:08)
computing PCA
on highly variable genes
with n_comps=17
finished (0:00:08)
computing PCA
on highly variable genes
with n_comps=18
finished (0:00:07)
computing PCA
on highly variable genes
with n_comps=19
finished (0:00:07)
computing PCA
on highly variable genes
with n_comps=20
finished (0:00:08)
computing PCA
on highly variable genes
with n_comps=21
finished (0:00:09)
computing PCA
on highly variable genes
with n_comps=22
finished (0:00:10)
computing PCA
on highly variable genes
with n_comps=23
finished (0:00:08)
computing PCA
on highly variable genes
with n_comps=24
finished (0:00:09)
computing PCA
on highly variable genes
with n_comps=25
finished (0:00:11)
computing PCA
on highly variable genes
with n_comps=26
finished (0:00:10)
computing PCA
on highly variable genes
with n_comps=27
finished (0:00:11)
computing PCA
on highly variable genes
with n_comps=28
finished (0:00:11)
computing PCA
on highly variable genes
with n_comps=29
finished (0:00:11)
computing PCA
on highly variable genes
with n_comps=30
finished (0:00:12)
computing PCA
on highly variable genes
with n_comps=31
finished (0:00:12)
computing PCA
on highly variable genes
with n_comps=32
finished (0:00:12)
computing PCA
on highly variable genes
with n_comps=33
finished (0:00:13)
computing PCA
on highly variable genes
with n_comps=34
finished (0:00:13)
computing PCA
on highly variable genes
with n_comps=35
finished (0:00:13)
computing PCA
on highly variable genes
with n_comps=36
finished (0:00:14)
computing PCA
on highly variable genes
with n_comps=37
finished (0:00:14)
computing PCA
on highly variable genes
with n_comps=38
finished (0:00:14)
computing PCA
on highly variable genes
with n_comps=39
finished (0:00:15)
computing PCA
on highly variable genes
with n_comps=40
finished (0:00:15)
computing PCA
on highly variable genes
with n_comps=41
finished (0:00:16)
computing PCA
on highly variable genes
with n_comps=42
finished (0:00:15)
computing PCA
on highly variable genes
with n_comps=43
finished (0:00:16)
computing PCA
on highly variable genes
with n_comps=44
finished (0:00:17)
computing PCA
on highly variable genes
with n_comps=45
finished (0:00:18)
computing PCA
on highly variable genes
with n_comps=46
finished (0:00:16)
computing PCA
on highly variable genes
with n_comps=47
finished (0:00:17)
computing PCA
on highly variable genes
with n_comps=48
finished (0:00:18)
computing PCA
on highly variable genes
with n_comps=49
finished (0:00:17)
computing PCA
on highly variable genes
with n_comps=50
finished (0:00:17)
computing PCA
on highly variable genes
with n_comps=51
finished (0:00:19)
#print ("Step7 of pipeline remaining part: continuation principal component analysis (PCA)")
sc.pp.pca(adata, svd_solver='arpack', n_comps=20) #first 20 PCs
#scatter plot generation in the PCA coordinates
sc.pl.pca(adata)
#scatter plot generation in the PCA coordinates, with 'CD14', 'CD79A','CD3D', 'FCER1A','NKG7' and 'CST3'
sc.pl.pca(adata, color= ['CD14', 'CD79A','CD3D', 'FCER1A','NKG7','CST3'])
print("CD14: CD14+ Monocytes, CD79A: B cell, CD3D : CD4+ T cell, FCER1A: CD16+ Monocyte, NKG7: NK cell, CST3: Dendritic cells")
computing PCA
on highly variable genes
with n_comps=20
finished (0:00:08)
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:163: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead. /home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:392: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:163: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead.
CD14: CD14+ Monocytes, CD79A: B cell, CD3D : CD4+ T cell, FCER1A: CD16+ Monocyte, NKG7: NK cell, CST3: Dendritic cells
Cell cycle score computing
#Cell cycle score computing step1
cell_cycle_genes = [x.strip() for x in open('./data/regev_lab_cell_cycle_genes.txt')]
print(len(cell_cycle_genes))
# Split into 2 lists
s_genes = cell_cycle_genes[:43]
g2m_genes = cell_cycle_genes[43:]
cell_cycle_genes = [x for x in cell_cycle_genes if x in adata.var_names]
print(len(cell_cycle_genes))
97 42
/tmp/ipykernel_2194019/3397686964.py:3: ResourceWarning: unclosed file <_io.TextIOWrapper name='./data/regev_lab_cell_cycle_genes.txt' mode='r' encoding='UTF-8'>
#Cell cycle score computing step2
sc.tl.score_genes_cell_cycle(adata, s_genes=s_genes, g2m_genes=g2m_genes)
calculating cell cycle phase
computing score 'S_score'
WARNING: genes are not in var_names and ignored: ['MLF1IP']
finished: added
'S_score', score of gene set (adata.obs).
643 total control genes are used. (0:00:03)
computing score 'G2M_score'
WARNING: genes are not in var_names and ignored: ['FAM64A', 'HN1']
finished: added
'G2M_score', score of gene set (adata.obs).
686 total control genes are used. (0:00:03)
--> 'phase', cell cycle phase (adata.obs)
#Cell cycle score computing step3
adata_cc_genes = adata[:, cell_cycle_genes]
sc.tl.pca(adata_cc_genes, svd_solver='arpack', n_comps=20)
sc.pl.violin(adata, ['S_score', 'G2M_score'],
jitter=0.4, groupby = 'sample', rotation=90)
computing PCA
on highly variable genes
with n_comps=20
finished (0:00:01)
... storing 'phase' as categorical
#Cell cycle score computing step4
#regressing out cell S_score and G2M_score
#As per Seurat recommendations, regressing out the difference between the G2M and S phase scores.
#This means that signals separating non-cycling cells and cycling cells will be maintained.
sc.pp.regress_out(adata, ['S_score', 'G2M_score'])
sc.pp.scale(adata)
regressing out ['S_score', 'G2M_score']
finished (0:04:50)
#Data Intergation and BatchCorrection is done using Harmony algorithm
print ("Data Intergation and Batch correction using Harmony algorithm")
sce.pp.harmony_integrate(adata, 'sample')
2024-01-18 12:43:32,489 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans... INFO:harmonypy:Computing initial centroids with sklearn.KMeans...
Data Intergation and Batch correction using Harmony algorithm
2024-01-18 12:43:47,819 - harmonypy - INFO - sklearn.KMeans initialization complete. INFO:harmonypy:sklearn.KMeans initialization complete. 2024-01-18 12:43:48,625 - harmonypy - INFO - Iteration 1 of 10 INFO:harmonypy:Iteration 1 of 10 2024-01-18 12:44:35,311 - harmonypy - INFO - Iteration 2 of 10 INFO:harmonypy:Iteration 2 of 10 2024-01-18 12:45:20,320 - harmonypy - INFO - Iteration 3 of 10 INFO:harmonypy:Iteration 3 of 10 2024-01-18 12:46:05,105 - harmonypy - INFO - Iteration 4 of 10 INFO:harmonypy:Iteration 4 of 10 2024-01-18 12:46:50,587 - harmonypy - INFO - Iteration 5 of 10 INFO:harmonypy:Iteration 5 of 10 2024-01-18 12:47:36,002 - harmonypy - INFO - Iteration 6 of 10 INFO:harmonypy:Iteration 6 of 10 2024-01-18 12:48:22,203 - harmonypy - INFO - Iteration 7 of 10 INFO:harmonypy:Iteration 7 of 10 2024-01-18 12:49:06,773 - harmonypy - INFO - Iteration 8 of 10 INFO:harmonypy:Iteration 8 of 10 2024-01-18 12:49:51,583 - harmonypy - INFO - Converged after 8 iterations INFO:harmonypy:Converged after 8 iterations
Post proccssing of Harmonization:
def post_process_harmonization(adata):
print("Post-processing of Harmonization")
# Set current PCA to be aligned with Harmonized PCA values
adata.obsm['X_pca'] = adata.obsm['X_pca_harmony']
# Compute neighborhood graphs and clustering
print("Computing neighborhood graphs and Clustering")
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=20)
# Cluster the neighborhood graph using Leiden Clustering algorithm
sc.tl.leiden(adata)
sc.tl.umap(adata)
# Example usage
post_process_harmonization(adata)
Post-processing of Harmonization
Computing neighborhood graphs and Clustering
computing neighbors
using 'X_pca' with n_pcs = 20
/home/jana/my-notebook-venv/lib/python3.8/site-packages/numba/core/typed_passes.py:334: NumbaPerformanceWarning:
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.
To find out why, try turning on parallel diagnostics, see https://numba.readthedocs.io/en/stable/user/parallel.html#diagnostics for help.
File "my-notebook-venv/lib/python3.8/site-packages/umap/rp_tree.py", line 135:
@numba.njit(fastmath=True, nogil=True, parallel=True)
def euclidean_random_projection_split(data, indices, rng_state):
^
/home/jana/my-notebook-venv/lib/python3.8/site-packages/umap/nndescent.py:91: NumbaPerformanceWarning:
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.
To find out why, try turning on parallel diagnostics, see https://numba.readthedocs.io/en/stable/user/parallel.html#diagnostics for help.
File "my-notebook-venv/lib/python3.8/site-packages/umap/utils.py", line 409:
@numba.njit(parallel=True)
def build_candidates(current_graph, n_vertices, n_neighbors, max_candidates, rng_state):
^
/home/jana/my-notebook-venv/lib/python3.8/site-packages/numba/core/typed_passes.py:334: NumbaPerformanceWarning:
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.
To find out why, try turning on parallel diagnostics, see https://numba.readthedocs.io/en/stable/user/parallel.html#diagnostics for help.
File "my-notebook-venv/lib/python3.8/site-packages/umap/nndescent.py", line 47:
@numba.njit(parallel=True)
def nn_descent(
^
finished: added to `.uns['neighbors']`
`.obsp['distances']`, distances for each pair of neighbors
`.obsp['connectivities']`, weighted adjacency matrix (0:00:20)
running Leiden clustering
finished: found 26 clusters and added
'leiden', the cluster labels (adata.obs, categorical) (0:00:21)
computing UMAP
finished: added
'X_umap', UMAP coordinates (adata.obsm) (0:01:03)
#UMAP cluster view Sample wise and cluster wise
sc.pl.umap(adata, color=['sample', 'leiden','S_score','G2M_score'])
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:163: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead. /home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:392: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored /home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:392: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
#Displaying Doublet scores, Doublet info and Sample wise distrubtion
print ("Doublet finding plots with scores and info across the samples")
sc.pl.umap(adata, color=['doublet_scores','doublet_info','sample','G2M_score','S_score'])
Doublet finding plots with scores and info across the samples
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:163: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead. /home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:392: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored /home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:392: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
#Doublet removing and Rest samples for post processing
print ("Considering the False Doublet finding information, means we are considering non doublet portion for the rest of tha analysis")
# also revert back to the raw counts as the main matrix in adata
adata = adata.raw.to_adata()
adata = adata[adata.obs['doublet_info'] == 'False',:]
print ("Current shape of the data")
print(adata.shape)
Considering the False Doublet finding information, means we are considering non doublet portion for the rest of tha analysis Current shape of the data (48391, 29738)
#UMAP cluster view Sample wise and cluster wise
sc.pl.umap(adata, color=['sample', 'leiden','S_score','G2M_score'], legend_loc="on data")
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:163: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead. /home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:392: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored /home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:392: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
#display merged object
display(adata)
View of AnnData object with n_obs × n_vars = 48391 × 29738
obs: 'type', 'sample', 'batch', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'percent_mt2', 'n_counts', 'percent_ribo', 'n_genes', 'percent_chrY', 'XIST-counts', 'doublet_scores', 'predicted_doublets', 'doublet_info', 'S_score', 'G2M_score', 'phase', 'leiden'
var: 'gene_ids', 'feature_types', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
uns: 'sample_colors', 'doublet_info_colors', 'log1p', 'hvg', 'pca', 'neighbors', 'leiden', 'umap', 'leiden_colors'
obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
obsp: 'distances', 'connectivities'
# Print merged adata var (variable) types
display(adata.var, adata.obs)
| gene_ids | feature_types | mt | ribo | hb | n_cells_by_counts | mean_counts | pct_dropout_by_counts | total_counts | n_cells | highly_variable | means | dispersions | dispersions_norm | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| AL627309.1 | ENSG00000238009 | Gene Expression | False | False | False | 285 | 0.005500 | 99.466841 | 294.0 | 285 | False | 0.008142 | 1.174038 | 0.001578 |
| AL627309.3 | ENSG00000239945 | Gene Expression | False | False | False | 12 | 0.000224 | 99.977551 | 12.0 | 12 | False | 0.000357 | 0.758448 | -0.732573 |
| AL627309.5 | ENSG00000241860 | Gene Expression | False | False | False | 1824 | 0.036236 | 96.587784 | 1937.0 | 1824 | False | 0.059170 | 1.330373 | 0.277746 |
| AL627309.4 | ENSG00000241599 | Gene Expression | False | False | False | 20 | 0.000393 | 99.962585 | 21.0 | 20 | False | 0.000535 | 0.786116 | -0.683696 |
| AL669831.2 | ENSG00000229905 | Gene Expression | False | False | False | 14 | 0.000262 | 99.973810 | 14.0 | 14 | False | 0.000584 | 1.921767 | 1.322459 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| AC240274.1 | ENSG00000271254 | Gene Expression | False | False | False | 1116 | 0.022767 | 97.912263 | 1217.0 | 1116 | False | 0.033384 | 1.178559 | 0.009564 |
| AC004556.3 | ENSG00000276345 | Gene Expression | False | False | False | 2310 | 0.047236 | 95.678608 | 2525.0 | 2310 | False | 0.063124 | 1.066135 | -0.189035 |
| AC233755.2 | ENSG00000277856 | Gene Expression | False | False | False | 23 | 0.000449 | 99.956973 | 24.0 | 23 | False | 0.002977 | 2.203188 | 1.819595 |
| AC233755.1 | ENSG00000275063 | Gene Expression | False | False | False | 15 | 0.000299 | 99.971939 | 16.0 | 15 | False | 0.001805 | 2.304728 | 1.998968 |
| AC007325.4 | ENSG00000278817 | Gene Expression | False | False | False | 507 | 0.009578 | 99.051539 | 512.0 | 507 | False | 0.015333 | 1.136737 | -0.064315 |
29738 rows × 14 columns
| type | sample | batch | n_genes_by_counts | total_counts | total_counts_mt | pct_counts_mt | total_counts_ribo | pct_counts_ribo | total_counts_hb | ... | n_genes | percent_chrY | XIST-counts | doublet_scores | predicted_doublets | doublet_info | S_score | G2M_score | phase | leiden | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| AAACCCAAGACATAAC-1-0 | Sarcoidosis | PBMC-Sarc-1 | 0 | 385 | 585.0 | 27.0 | 4.615385 | 32.0 | 5.470086 | 1.0 | ... | 385 | 0.421941 | 0.0 | 0.033419 | False | False | -0.024070 | 0.009892 | G2M | 20 |
| AAACCCAAGAGGCGGA-1-0 | Sarcoidosis | PBMC-Sarc-1 | 0 | 2191 | 5556.0 | 423.0 | 7.613391 | 613.0 | 11.033117 | 2.0 | ... | 2191 | 0.069140 | 0.0 | 0.103814 | False | False | 0.009555 | -0.058273 | S | 16 |
| AAACCCAAGCGTACAG-1-0 | Sarcoidosis | PBMC-Sarc-1 | 0 | 936 | 2864.0 | 253.0 | 8.833798 | 1131.0 | 39.490223 | 0.0 | ... | 936 | 0.151860 | 0.0 | 0.015339 | False | False | -0.056477 | -0.057470 | G1 | 5 |
| AAACCCAAGGTACAAT-1-0 | Sarcoidosis | PBMC-Sarc-1 | 0 | 3622 | 11581.0 | 736.0 | 6.355237 | 1679.0 | 14.497885 | 2.0 | ... | 3622 | 0.206540 | 0.0 | 0.048703 | False | False | 0.045412 | -0.026306 | S | 11 |
| AAACCCACAGCGTACC-1-0 | Sarcoidosis | PBMC-Sarc-1 | 0 | 2219 | 6849.0 | 536.0 | 7.825960 | 1114.0 | 16.265148 | 0.0 | ... | 2219 | 0.165529 | 0.0 | 0.012259 | False | False | -0.068371 | -0.071338 | G1 | 10 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| TTTGTTGGTCGCAACC-1-6 | Healthy | PBMC-healthy-4 | 6 | 3587 | 10727.0 | 377.0 | 3.514496 | 864.0 | 8.054442 | 0.0 | ... | 3587 | 0.106929 | 3.0 | 0.042857 | False | False | -0.020565 | -0.092872 | G1 | 2 |
| TTTGTTGGTTGGATCT-1-6 | Healthy | PBMC-healthy-4 | 6 | 2795 | 9286.0 | 391.0 | 4.210640 | 2746.0 | 29.571400 | 0.0 | ... | 2795 | 0.248139 | 2.0 | 0.017654 | False | False | -0.106062 | -0.070131 | G1 | 7 |
| TTTGTTGGTTTCTTAC-1-6 | Healthy | PBMC-healthy-4 | 6 | 2891 | 6943.0 | 211.0 | 3.039032 | 952.0 | 13.711652 | 1.0 | ... | 2891 | 0.145243 | 1.0 | 0.006109 | False | False | -0.070861 | -0.058664 | G1 | 4 |
| TTTGTTGTCCATTTCA-1-6 | Healthy | PBMC-healthy-4 | 6 | 2539 | 7021.0 | 376.0 | 5.355362 | 1831.0 | 26.078907 | 0.0 | ... | 2539 | 0.267857 | 2.0 | 0.016736 | False | False | 0.016351 | -0.067255 | S | 1 |
| TTTGTTGTCTACACAG-1-6 | Healthy | PBMC-healthy-4 | 6 | 3581 | 9646.0 | 292.0 | 3.027162 | 637.0 | 6.603774 | 1.0 | ... | 3581 | 0.096235 | 1.0 | 0.004764 | False | False | -0.009492 | -0.091022 | G1 | 6 |
48391 rows × 24 columns
#Gaussian kernel density estimation is used to calculate the density of cells in an embedded space.
#This can be performed per category over a categorical cell annotation.
#Calcuting the density per sample
sc.tl.embedding_density(adata, groupby='sample')
#plot the density per sample
sc.pl.embedding_density(adata, groupby='sample')
computing density on 'umap'
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/tools/_embedding_density.py:169: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.
--> added
'umap_density_sample', densities (adata.obs)
'umap_density_sample_params', parameter (adata.uns)
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/__init__.py:1450: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead. /home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:163: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead. /home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:163: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead. /home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:163: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead. /home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:163: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead. /home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:163: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead. /home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:163: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead. /home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:163: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead.
#violin plot some Tcell markers inside adata sample wise
sc.pl.violin(adata, ['CD3D','CD8A'], groupby='sample', figsize=(3,1), gridspec_kw={'wspace':0.8}, rotation=90, alpha=0.8)
#import scipy io package
from scipy import io
save_file = '/home/jana/Updated_SCANPY_QC_filtered_PBMC_for_Sarcoid.h5ad'
adata.write_h5ad(save_file)